# Load the world happiness data set into `whr_alternate`.
whr_alternate <- read.csv(
  file = "/Users/zhenganlyu/Library/Mobile Documents/com~apple~CloudDocs/whr_alternate.csv"
)
# Column X only shows the list number, so it is dropped before summarising.
whr_alternate[["X"]] <- NULL
# Overview of every column: 283 observations, 17 variables, some missing values.
summary(whr_alternate)
Country.name Year Life.Ladder Log.GDP.per.capita
Afghanistan: 2 Min. :2017 Min. :2.662 Min. : 6.494
Albania : 2 1st Qu.:2017 1st Qu.:4.650 1st Qu.: 8.426
Algeria : 2 Median :2017 Median :5.481 Median : 9.460
Argentina : 2 Mean :2017 Mean :5.480 Mean : 9.276
Armenia : 2 3rd Qu.:2018 3rd Qu.:6.276 3rd Qu.:10.237
Australia : 2 Max. :2018 Max. :7.858 Max. :11.454
(Other) :271 NA's :13
Social.support Healthy.life.expectancy.at.birth
Min. :0.3196 Min. :45.20
1st Qu.:0.7387 1st Qu.:59.00
Median :0.8304 Median :66.10
Mean :0.8078 Mean :64.42
3rd Qu.:0.9053 3rd Qu.:68.90
Max. :0.9845 Max. :76.80
NA's :1 NA's :6
Freedom.to.make.life.choices Generosity Perceptions.of.corruption
Min. :0.3735 Min. :-0.33638 Min. :0.09656
1st Qu.:0.7146 1st Qu.:-0.14592 1st Qu.:0.68211
Median :0.8038 Median :-0.03316 Median :0.79097
Mean :0.7825 Mean :-0.01741 Mean :0.73044
3rd Qu.:0.8793 3rd Qu.: 0.08673 3rd Qu.:0.85146
Max. :0.9852 Max. : 0.62958 Max. :0.95439
NA's :1 NA's :15 NA's :18
Positive.affect Negative.affect Confidence.in.national.government
Min. :0.4210 Min. :0.0927 Min. :0.07971
1st Qu.:0.6233 1st Qu.:0.2210 1st Qu.:0.33474
Median :0.7250 Median :0.2827 Median :0.47367
Mean :0.7072 Mean :0.2924 Mean :0.49879
3rd Qu.:0.7947 3rd Qu.:0.3556 3rd Qu.:0.63786
Max. :0.9028 Max. :0.5993 Max. :0.98812
NA's :2 NA's :2 NA's :26
Democratic.Quality Delivery.Quality
Min. :-2.3266 Min. :-2.01850
1st Qu.:-0.7043 1st Qu.:-0.67840
Median :-0.1644 Median :-0.17384
Mean :-0.1225 Mean :-0.00347
3rd Qu.: 0.6412 3rd Qu.: 0.68622
Max. : 1.5750 Max. : 2.06917
NA's :136 NA's :136
Standard.deviation.of.ladder.by.country.year
Min. :1.198
1st Qu.:1.858
Median :2.215
Mean :2.243
3rd Qu.:2.596
Max. :3.719
Standard.deviation.Mean.of.ladder.by.country.year
Min. :0.1654
1st Qu.:0.3123
Median :0.4076
Mean :0.4371
3rd Qu.:0.5474
Max. :0.9717
gini.of.household.income.reported.in.Gallup..by.wp5.year
Min. :0.2010
1st Qu.:0.3761
Median :0.4406
Mean :0.4628
3rd Qu.:0.5615
Max. :0.8520
NA's :4
# Compact listing of each column's type and first few values.
utils::str(whr_alternate)
'data.frame': 283 obs. of 17 variables:
$ Country.name : Factor w/ 152 levels "Afghanistan",..: 1 1 2 2 3 3 4 4 5 5 ...
$ Year : int 2017 2018 2017 2018 2017 2018 2017 2018 2017 2018 ...
$ Life.Ladder : num 2.66 2.69 4.64 5 5.25 ...
$ Log.GDP.per.capita : num 7.5 7.49 9.38 9.41 9.54 ...
$ Social.support : num 0.491 0.508 0.638 0.684 0.807 ...
$ Healthy.life.expectancy.at.birth : num 52.8 52.6 68.4 68.7 65.7 ...
$ Freedom.to.make.life.choices : num 0.427 0.374 0.75 0.824 0.437 ...
$ Generosity : num -0.1122 -0.08489 -0.03264 0.00538 -0.19152 ...
$ Perceptions.of.corruption : num 0.954 0.928 0.876 0.899 0.7 ...
$ Positive.affect : num 0.496 0.424 0.669 0.713 0.642 ...
$ Negative.affect : num 0.371 0.405 0.334 0.319 0.289 ...
$ Confidence.in.national.government : num 0.261 0.365 0.458 0.435 NA ...
$ Democratic.Quality : num -1.887 NA 0.3 NA -0.928 ...
$ Delivery.Quality : num -1.438 NA -0.13 NA -0.817 ...
$ Standard.deviation.of.ladder.by.country.year : num 1.45 1.41 2.68 2.64 2.04 ...
$ Standard.deviation.Mean.of.ladder.by.country.year : num 0.546 0.523 0.578 0.528 0.389 ...
$ gini.of.household.income.reported.in.Gallup..by.wp5.year: num 0.287 0.291 0.41 0.456 0.528 ...
# Complete-case copy of the happiness data: any row containing an NA is dropped.
# NOTE(review): Democratic.Quality and Delivery.Quality are NA in 136 rows, so
# this discards a large share of the observations -- confirm that is intended.
whr_alternate1 <- na.omit(whr_alternate)

# Second data source (GDP figures), read with read_csv() and converted to a
# base data frame.
data <- data.frame(
  read_csv("/Users/zhenganlyu/Downloads/GDP-Data-Set.csv")
)
# `newdata` holds the same rows ordered by ascending GDP.PPP.Per.Capita.
newdata <- data[order(data[["GDP.PPP.Per.Capita"]]), ]

# Per-column summary of the sorted GDP data.
summary(newdata)
X. Country.name Year GDP.PPP.Per.Capita
Min. : 1.00 Length:140 Min. :2017 Min. : 727
1st Qu.: 35.75 Class :character 1st Qu.:2017 1st Qu.: 4873
Median : 70.50 Mode :character Median :2017 Median : 14903
Mean : 70.50 Mean :2017 Mean : 21688
3rd Qu.:105.25 3rd Qu.:2017 3rd Qu.: 32729
Max. :140.00 Max. :2017 Max. :107641
Life.Ladder GDP.Per.Capita.Nominal
Min. :2.662 Min. : 357
1st Qu.:4.608 1st Qu.: 1562
Median :5.587 Median : 5769
Mean :5.481 Mean : 14830
3rd Qu.:6.278 3rd Qu.: 18825
Max. :7.788 Max. :105280
# Compact listing of each column's type and first few values.
utils::str(newdata)
'data.frame': 140 obs. of 6 variables:
$ X. : num 1 2 3 4 5 6 7 8 9 10 ...
$ Country.name : chr "Central African Republic" "Congo (Kinshasa)" "Niger" "Malawi" ...
$ Year : num 2017 2017 2017 2017 2017 ...
$ GDP.PPP.Per.Capita : num 727 889 1019 1205 1250 ...
$ Life.Ladder : num 3.48 4.31 4.62 3.42 4.28 ...
$ GDP.Per.Capita.Nominal: num 424 462 376 357 441 699 504 450 618 673 ...
Pranav Prabhas
# Convert Year to a factor so it can drive the colour and fill aesthetics.
whr_alternate[["Year"]] <- as.factor(whr_alternate[["Year"]])

# Density-scaled histogram of Life.Ladder, overlaid with a density outline,
# both split by Year.
ggplot(
  whr_alternate,
  aes(x = Life.Ladder, y = ..density.., color = Year, fill = Year)
) +
  geom_histogram(position = "identity", bins = 25, alpha = 0.5) +
  geom_density(alpha = 0, fill = "#FB6656") +
  labs(
    title = "The Happiness Distribution",
    x = "Life Ladder Score",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.title = element_text(hjust = 0.5)
  )
Aneesh Didwania
# Scatter of Life.Ladder against Log.GDP.per.capita, points coloured by year,
# with a single straight-line fit drawn without a confidence band.
my_graph <- ggplot(
  whr_alternate,
  aes(x = (Log.GDP.per.capita), y = Life.Ladder)
) +
  geom_point(aes(color = factor(Year))) +
  stat_smooth(method = "lm", se = FALSE, col = "#C42126", size = 1)

# Add labels, the minimal theme and a centred title, then print the plot.
my_graph +
  labs(
    title = "Relation between GDP and Happiness",
    x = "GDP Per Capita",
    y = "Life Ladder",
    color = "Year"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
Dorothy Zhu
# Life.Ladder against healthy life expectancy, coloured by Year, with a linear
# fit and its confidence band.
# The stray trailing comma in geom_point() (an empty extra argument) is removed
# and `T` is spelled out as TRUE, since `T` is an ordinary, reassignable variable.
ggplot(
  whr_alternate,
  aes(x = Healthy.life.expectancy.at.birth, y = Life.Ladder, color = Year)
) +
  geom_point(size = 1.5, shape = 9) +
  geom_smooth(method = "lm", se = TRUE)
Dorcas Cheung
# Life.Ladder against freedom to make life choices, with a linear fit.
ggplot(
  whr_alternate,
  aes(x = Freedom.to.make.life.choices, y = Life.Ladder, color = Year)
) +
  geom_point(shape = 23, fill = "blue", color = "darkred") +
  geom_smooth(method = "lm")
# Correlation between the two variables on the complete-case data.
r <- with(whr_alternate1, cor(Freedom.to.make.life.choices, Life.Ladder))
r
[1] 0.4955001
# Standard deviations of the response (y) and the predictor (x).
SDy <- sd(whr_alternate1[["Life.Ladder"]])
SDx <- sd(whr_alternate1[["Freedom.to.make.life.choices"]])
SDy
[1] 1.148219
SDx
[1] 0.1218002
# Slope of the regression line computed by hand as r * SDy / SDx.
m <- r * (SDy / SDx)
m
[1] 4.671113
# Same regression fitted with lm() for comparison.
lm.out <- lm(Life.Ladder ~ Freedom.to.make.life.choices, data = whr_alternate1)
summary(lm.out)
Call:
lm(formula = Life.Ladder ~ Freedom.to.make.life.choices, data = whr_alternate1)
Residuals:
Min 1Q Median 3Q Max
-2.95151 -0.55260 0.05589 0.87558 1.92529
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.8180 0.5817 3.125 0.00221 **
Freedom.to.make.life.choices 4.6711 0.7295 6.403 2.74e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.001 on 126 degrees of freedom
Multiple R-squared: 0.2455, Adjusted R-squared: 0.2395
F-statistic: 41 on 1 and 126 DF, p-value: 2.74e-09
Lanjing Wang
# Life.Ladder against the Gallup household-income gini, coloured by Year, with
# linear fits drawn without confidence bands.
ggplot(
  whr_alternate,
  aes(
    x = gini.of.household.income.reported.in.Gallup..by.wp5.year,
    y = Life.Ladder,
    color = Year
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# 2018 rows without the two quality columns, then complete cases only.
Year2018 <- whr_alternate %>%
  filter(Year == "2018") %>%
  select(-c(Democratic.Quality, Delivery.Quality))
Year2018_ <- na.omit(Year2018)
# Correlation between Life.Ladder and gini for 2018.
r4 <- with(
  Year2018_,
  cor(Life.Ladder, gini.of.household.income.reported.in.Gallup..by.wp5.year)
)
r4
[1] -0.3839868
# 2017 rows (all columns kept), then complete cases only.
Year2017_ <- whr_alternate %>%
  filter(Year == "2017")
Year2017__ <- na.omit(Year2017_)
# Correlation between Life.Ladder and gini for 2017.
r5 <- with(
  Year2017__,
  cor(Life.Ladder, gini.of.household.income.reported.in.Gallup..by.wp5.year)
)
r5
[1] -0.4946095
Zhengan Lyu
# The pipe (%>%) passes its left-hand side as input to the call on its right.
# Year2017: the rows of whr_alternate that belong to the year 2017.
Year2017 <- filter(whr_alternate, Year == "2017")
# Year2018: the rows of whr_alternate that belong to the year 2018. There is no
# data in Democratic.Quality and Delivery.Quality for that year, so select()
# removes those two columns.
Year2018 <- whr_alternate %>%
  filter(Year == "2018") %>%
  select(-c(Democratic.Quality, Delivery.Quality))
# Correlation heatmap for 2017: shows the correlation between any two
# variables. Missing values are removed first with na.omit().
plot_correlation(na.omit(Year2017))
# Correlation heatmap for 2018, again after removing missing values.
plot_correlation(na.omit(Year2018))
Member: Aneesh Didwania and Pranav Prabhas
# Life.Ladder against PPP GDP per capita, with a loess smoother, dollar-formatted
# x-axis labels and a plain white panel.
ggplot(
  newdata,
  aes(x = GDP.PPP.Per.Capita, y = Life.Ladder, color = Life.Ladder)
) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  scale_x_continuous(labels = dollar) +
  labs(
    title = "Mapping Money and Happiness (r = 0.73)",
    x = "GDP Per Capita (PPP)",
    y = "Life Ladder Score"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", colour = "darkblue")
  )
# Correlation quoted in the plot title.
with(newdata, cor(GDP.PPP.Per.Capita, Life.Ladder))
[1] 0.7324451
The plot above seeks to find a relationship between the GDP of a country and its happiness. After plotting the scatter, we conducted a polynomial regression to see how strong the association was. We arrived at an r value of 0.73. While this is indicative of a moderately strong relationship, it is not definitive. To be sure that the relationship holds for the entire population, and not just for our sample, we performed a hypothesis test.
# Sample size for the hand-computed test. nrow() counts observations, whereas
# length() on a data frame returns the number of columns (6 for newdata, which
# has 140 rows); the too-small value inflated the standard error further down.
# NOTE: the printed happysd, gdpsd, se, z and pvalue values in this section were
# produced with the old value and must be regenerated by re-running the script.
rows <- nrow(newdata)
# Sample means of the two variables.
happymean <- with(newdata, mean(Life.Ladder))
happymean
[1] 5.48133
gdpmean <- with(newdata, mean(GDP.PPP.Per.Capita))
gdpmean
[1] 21687.94
# Standard deviations, each rescaled by sqrt((rows - 1) / rows).
happysd <- sd(newdata[["Life.Ladder"]]) * sqrt((rows - 1) / rows)
happysd
[1] 1.040583
gdpsd <- sd(newdata[["GDP.PPP.Per.Capita"]]) * sqrt((rows - 1) / rows)
gdpsd
[1] 19306.93
# Correlation between Life.Ladder and GDP.PPP.Per.Capita.
r <- with(newdata, cor(Life.Ladder, GDP.PPP.Per.Capita))
r
[1] 0.7324451
# Regression slope computed by hand from r and the two standard deviations.
slope <- r * happysd / gdpsd
slope
[1] 3.947649e-05
# Standard error of the slope, using rows - 2 in the denominator.
se <- (sqrt(1 - (r * r)) * happysd) / (sqrt(rows - 2) * gdpsd)
se
[1] 1.834719e-05
# Test statistic for a slope of zero.
z <- slope / se
z
[1] 2.151636
# Two-sided p-value from the standard normal distribution.
pvalue <- 2 * pnorm(-abs(z))
pvalue
[1] 0.031426
# Fit the same regression with lm() to compare against the hand-computed slope,
# standard error and test statistic. The formula is written with `newdata$`
# prefixes, which is why the printed coefficient labels carry that prefix.
summary(lm(newdata$Life.Ladder ~ newdata$GDP.PPP.Per.Capita))
Call:
lm(formula = newdata$Life.Ladder ~ newdata$GDP.PPP.Per.Capita)
Residuals:
Min 1Q Median 3Q Max
-2.04145 -0.51128 0.00285 0.55838 1.92457
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.625e+00 9.446e-02 48.96 <2e-16 ***
newdata$GDP.PPP.Per.Capita 3.948e-05 3.124e-06 12.64 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7789 on 138 degrees of freedom
Multiple R-squared: 0.5365, Adjusted R-squared: 0.5331
F-statistic: 159.7 on 1 and 138 DF, p-value: < 2.2e-16
Member: Dorcas Cheung & Dorothy Zhu
# Complete cases of the 2018 data.
Year2018_ <- na.omit(Year2018)
# Correlation between healthy life expectancy and Life.Ladder.
r1 <- with(Year2018_, cor(Healthy.life.expectancy.at.birth, Life.Ladder))
r1
[1] 0.7543433
# Standard deviation of healthy life expectancy (the predictor).
SDofhh <- sd(Year2018_[["Healthy.life.expectancy.at.birth"]])
SDofhh
[1] 6.811616
# Standard deviation of Life.Ladder (the response).
SDofll <- sd(Year2018_[["Life.Ladder"]])
SDofll
[1] 1.125497
# Regression slope computed by hand from r1 and the two standard deviations.
m1 <- r1 * SDofll / SDofhh
m1
[1] 0.1246416
# Standard error of the slope m1.
# Uses r1, the correlation computed for this analysis; the previous expression
# used `r`, which at this point still holds the correlation from the GDP
# section. The sample size is taken from the data instead of the hard-coded 110.
# NOTE: the printed se1 and z1 values below were produced with the old
# expression; re-run to refresh them (lm() reports a standard error of 0.01044).
se1 <- (sqrt(1 - r1^2) * SDofll) / (sqrt(nrow(Year2018_) - 2) * SDofhh)
se1
[1] 0.01082476
# Test statistic for the null hypothesis that the slope is zero.
z1 <- (m1 - 0) / se1
z1
[1] 11.51449
# One-sided upper-tail p-value. Asking pnorm() for the upper tail directly
# avoids the cancellation in 1 - pnorm(z1), which rounds to exactly 0 for a
# statistic this large. NOTE: the "[1] 0" printed below came from the old
# expression; re-run to refresh it.
pnorm(z1, lower.tail = FALSE)
[1] 0
# Fit the same regression with lm() to compare against the hand-computed slope
# and standard error. The formula is written with `Year2018_$` prefixes, which
# is why the printed coefficient labels carry that prefix.
summary(lm(Year2018_$Life.Ladder~Year2018_$Healthy.life.expectancy.at.birth))
Call:
lm(formula = Year2018_$Life.Ladder ~ Year2018_$Healthy.life.expectancy.at.birth)
Residuals:
Min 1Q Median 3Q Max
-1.54896 -0.50480 0.04867 0.53385 1.60421
Coefficients:
Estimate Std. Error t value
(Intercept) -2.50559 0.68033 -3.683
Year2018_$Healthy.life.expectancy.at.birth 0.12464 0.01044 11.942
Pr(>|t|)
(Intercept) 0.000362 ***
Year2018_$Healthy.life.expectancy.at.birth < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7423 on 108 degrees of freedom
Multiple R-squared: 0.569, Adjusted R-squared: 0.565
F-statistic: 142.6 on 1 and 108 DF, p-value: < 2.2e-16
Social Support & Life Ladder Scatter Plot
Zhengan Lyu
# Visualize the correlation by making a scatter plot.
# Make an interactive scatter plot for the years 2017 and 2018 separately; set
# "social support" as the x-axis and "life ladder" as the y-axis; apply a
# regression line and label the scatter plot.
# The statements below were previously collapsed onto the comment line above
# them, so none of them were executed; they are restored to one step per line.
scatterplot1 <- whr_alternate %>%
  ggplot(aes(x = Social.support, y = Life.Ladder)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  facet_wrap(~Year) +
  labs(title = "Relationship between life ladder & Social support")
ggplotly(scatterplot1)
# Individual Interpretation